The following steps prepare the dataset for the modeling part. The actual feature analysis was done in another kernel.
Loading Data
library(data.table) # database-like tables
library(Matrix) # sparse matrices
library(ggplot2) # general plots
library(mice) # imputation
library(xgboost) # xtreme gradient boosting
library(crossval) # doing CV
library(ICEbox) # partial dependency plots
# Load the train/test CSVs as data.tables and stack them into one table so
# that every cleaning step below is applied to both sets consistently.
test <- data.table(read.csv("../input/test.csv"))
train <- data.table(read.csv("../input/train.csv"))
# Placeholder target for the test rows (treated as missing by is.na()).
test$SalePrice <- NaN
full <- rbind(train, test)
# Remember each row's origin so the two sets can be split apart again later.
full$label <- rep(c("train", "test"), c(nrow(train), nrow(test)))
Feature Attributes
# types
# Hand-classified measurement types for every feature. These vectors drive
# the type coercions and the ordinal level ordering below.
# Nominal: unordered categories (become plain factors / dummy columns).
nomVars <- c("MSZoning", "Street", "Alley", "LandContour", "LotConfig",
"Neighborhood", "Condition1", "Condition2", "HouseStyle",
"RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
"MasVnrType", "Foundation", "Heating", "CentralAir", "Electrical",
"GarageType", "PavedDrive", "MiscFeature", "SaleType",
"SaleCondition")
# Ordinal: categories with a meaningful order (levels fixed in lvlOrd below).
ordVars <- c("MSSubClass", "LotShape", "Utilities", "LandSlope", "BldgType",
"OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
"BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
"HeatingQC", "KitchenQual", "Functional", "FireplaceQu",
"GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence",
"MoSold")
# Discrete: integer counts / years.
discVars <- c("YearBuilt", "YearRemodAdd", "BsmtFullBath", "BsmtHalfBath",
"FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
"TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
"YrSold")
# Continuous: areas and values (read.csv prefixes leading digits with "X").
contVars <- c("LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
"TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF",
"EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea",
"MiscVal", "LotFrontage")
# Catalog every feature (Id column dropped) with its measurement type.
# Anything not listed in one of the type vectors stays "nominal".
featAttr <- data.table(name = colnames(test)[-1], type = "nominal")
typeGroups <- list(ordinal = ordVars, discrete = discVars,
                   continuous = contVars)
for (typeName in names(typeGroups)) {
  featAttr[name %in% typeGroups[[typeName]], type := typeName]
}
# special values
# For these columns an NA in the raw data means "feature not present"
# (no alley, no basement, ...), not "value unknown" — recode each NA to an
# explicit level so it survives as its own category.
specialNA <- c(
  Alley        = "noAccess",
  BsmtQual     = "noBasement",
  BsmtCond     = "noBasement",
  BsmtExposure = "noBasement",
  BsmtFinType1 = "noBasement",
  BsmtFinType2 = "noBasement",
  FireplaceQu  = "noFireplace",
  GarageType   = "noGarage",
  GarageFinish = "noGarage",
  GarageQual   = "noGarage",
  GarageCond   = "noGarage",
  PoolQC       = "noPool",
  Fence        = "noFence",
  MiscFeature  = "none"
)
for (col in names(specialNA)) {
  set(full, i = which(is.na(full[[col]])), j = col, value = specialNA[[col]])
}
# Obvious typo in the raw data: a garage cannot be built in the year 2207.
full[GarageYrBlt == 2207, GarageYrBlt := 2007]
# variable types
# Coerce every column to the storage type matching its declared class.
for (col in c(nomVars, ordVars)) {
  full[[col]] <- as.factor(full[[col]])
}
for (col in contVars) {
  full[[col]] <- as.numeric(full[[col]])
}
for (col in discVars) {
  full[[col]] <- as.integer(full[[col]])
}
# order levels
# Explicit level orders for the ordinal factors (generally best -> worst, or
# the order given in the data description). Beware: factor(x, levels = ...)
# silently turns any value NOT listed in `levels` into NA.
# NOTE(review): the BldgType codes below follow the data description
# ("2FmCon", "Duplx", "TwnhsI"); verify they match the codes actually used
# in the CSV, otherwise those rows become NA here.
lvlOrd <- list(
LotShape = c("Reg", "IR1", "IR2", "IR3"),
Utilities = c("AllPub", "NoSewr", "NoSeWa", "ELO"),
LandSlope = c("Gtl", "Mod", "Sev"),
BldgType = c("1Fam", "2FmCon", "Duplx", "TwnhsE", "TwnhsI"),
OverallQual = 10:1, # listed 10..1: best quality gets integer rank 1 later
OverallCond = 10:1, # same convention as OverallQual
ExterQual = c("Ex", "Gd", "TA", "Fa", "Po"),
ExterCond = c("Ex", "Gd", "TA", "Fa", "Po"),
BsmtQual = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
BsmtCond = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
BsmtExposure = c("Gd", "Av", "Mn", "No", "noBasement"),
BsmtFinType1 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
BsmtFinType2 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
HeatingQC = c("Ex", "Gd", "TA", "Fa", "Po"),
KitchenQual = c("Ex", "Gd", "TA", "Fa", "Po"),
Functional = c("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"),
FireplaceQu = c("Ex", "Gd", "TA", "Fa", "Po", "noFireplace"),
GarageFinish = c("Fin", "RFn", "Unf", "noGarage"),
GarageQual = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
GarageCond = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
PoolQC = c("Ex", "Gd", "TA", "Fa", "noPool"),
Fence = c("GdPrv", "MnPrv", "GdWo", "MnWw", "noFence"),
MoSold = 1:12
)
# Re-create each ordinal factor with its fixed level order.
for (j in names(lvlOrd)) {
full[[j]] <- factor(full[[j]], levels = lvlOrd[[j]])
}
# undefined values
# Where the parent feature is absent, the related numeric measurements are
# 0 by definition, not missing.
full[BsmtCond == "noBasement" & is.na(BsmtUnfSF), BsmtUnfSF := 0]
full[BsmtCond == "noBasement" & is.na(TotalBsmtSF), TotalBsmtSF := 0]
full[BsmtFinType1 == "noBasement" & is.na(BsmtFinSF1), BsmtFinSF1 := 0]
full[BsmtFinType2 == "noBasement" & is.na(BsmtFinSF2), BsmtFinSF2 := 0]
full[GarageType == "noGarage" & is.na(GarageYrBlt), GarageYrBlt := 0]
full[GarageType == "noGarage" & is.na(GarageCars), GarageCars := 0]
full[GarageType == "noGarage" & is.na(GarageArea), GarageArea := 0]
full[PoolQC == "noPool" & is.na(PoolArea), PoolArea := 0]
# BUG FIX: MiscFeature NAs were recoded to "none" above, not "noFeature",
# so the original condition (MiscFeature == "noFeature") could never match
# and MiscVal was left NA for those rows.
full[MiscFeature == "none" & is.na(MiscVal), MiscVal := 0]
Missing Values
# single imputation
# Hand-picked imputations for a handful of rows; the indices are row numbers
# of `full` (train rows first, then test rows).
full[c(949, 1488, 2349), BsmtExposure := "No"]
full[c(2041, 2186, 2525), BsmtCond := "TA"]
full[c(2218, 2219), BsmtQual := "TA"]
# Rows 2127/2577 have garage fields but no usable garage record: zero out the
# numeric fields and mark them as having no garage.
full[c(2127, 2577), GarageYrBlt := 0]
full[c(2127, 2577), GarageCars := 0]
full[c(2127, 2577), GarageArea := 0]
full[c(2127, 2577), GarageType := "noGarage"]
# Remaining NAs in categorical features are filled with a single value.
# NOTE(review): these constants (and MasVnrArea = 202 below) are presumably
# modes/medians from the separate feature-analysis kernel — confirm there.
full[is.na(Exterior1st), Exterior1st := "VinylSd"]
full[is.na(Exterior2nd), Exterior2nd := "VinylSd"]
full[is.na(Electrical), Electrical := "SBrkr"]
full[is.na(KitchenQual), KitchenQual := "TA"]
full[is.na(SaleType), SaleType := "WD"]
full[is.na(Utilities), Utilities := "AllPub"]
full[is.na(Functional), Functional := "Typ"]
full[is.na(MSZoning), MSZoning := "RL"]
full[is.na(MasVnrArea), MasVnrArea := 202]
full[is.na(MasVnrType), MasVnrType := "none"]
full[is.na(BsmtFullBath), BsmtFullBath := 0]
full[is.na(BsmtHalfBath), BsmtHalfBath := 0]
# transform
# Log-transform the target; log10(x + 1) for all continuous features (the
# "+ 1" keeps zero-valued areas finite).
full$SalePrice <- log(full$SalePrice)
for (j in contVars) {
full[[j]] <- log10(full[[j]] + 1)
}
# multiple imputation
# mice() with default settings (five imputed datasets); seed fixed for
# reproducibility. complete(imp) returns the first completed dataset.
vars <- c(contVars, discVars, "BldgType", "MSSubClass", "HouseStyle")
imp <- mice(full[, vars, with = FALSE], seed = 42)
# Only these two imputed columns are copied back into `full`.
full$BldgType <- complete(imp)$BldgType
full$LotFrontage <- complete(imp)$LotFrontage
Important Features
# Features flagged as important in the earlier analysis kernel.
# NOTE(review): `impVars` does not appear to be used anywhere later in this
# file — either feed it into the model matrix or remove it.
impVars <- c("MSSubClass", "MSZoning", "LotFrontage", "LotArea", "LandContour",
"Neighborhood", "BldgType", "HouseStyle", "OverallQual",
"OverallCond", "YearBuilt", "YearRemodAdd", "Exterior1st",
"Exterior2nd", "MasVnrArea", "ExterQual", "Foundation",
"BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
"BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF", "HeatingQC",
"CentralAir", "X1stFlrSF", "X2ndFlrSF", "GrLivArea",
"BsmtFullBath", "FullBath", "HalfBath", "BedroomAbvGr",
"KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Fireplaces",
"FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish",
"GarageCars", "GarageArea", "GarageQual", "GarageCond",
"PavedDrive", "WoodDeckSF", "OpenPorchSF")
Adjust Features
# binary features
# Indicator columns marking the absence of optional house components.
full[, noAccess := Alley == "noAccess"]
full[, noBasement := BsmtQual == "noBasement" | BsmtCond == "noBasement"]
full[, noFireplace := FireplaceQu == "noFireplace"]
full[, noGarage := GarageType == "noGarage" | GarageCond == "noGarage"]
full[, noPool := PoolQC == "noPool"]
full[, noFence := Fence == "noFence"]
full[, noMisc := MiscFeature == "none"]
binVars <- c("noAccess", "noBasement", "noFireplace", "noGarage",
             "noPool", "noFence", "noMisc")
# ordinal to integer
# Each ordinal factor becomes its level index (1 = first level in lvlOrd).
for (col in ordVars) {
  full[[col]] <- as.integer(full[[col]])
}
The dataset is split back into training and test.
# Split the combined table back into train (with target) and test (without).
train <- full[label == "train", !c("Id", "label")]
# NOTE(review): data.tables do not keep persistent rownames, so these two
# rownames<- assignments have no lasting effect; keep the Id column instead
# if row identity is needed downstream.
rownames(train) <- full[label == "train", Id]
test <- full[label == "test", !c("Id", "label", "SalePrice")]
rownames(test) <- full[label == "test", Id]
Categorical variables are still coded as factors. Here, they are split up into binary dummy variables.
# Dummy-encode all predictors (no intercept) into a sparse model matrix;
# Y is the log-transformed sale price.
X <- sparse.model.matrix(~ -1 + ., data = train[, !"SalePrice"])
Y <- train$SalePrice
To see overfitting in the training, the training set must be split up again into a validation and a training set. The model will be repeatedly fitted on the training set, the progress is observed on the performance of the fitted model on the validation set. Here, a random 25% validation 75% training set split is chosen. The training set should be big to enable a good training, but the validation set must still be big enough to capture structures within the data and correctly estimate the performance of the fitted model. > I hope 25% is still big enough on 1460 data points
# Hold out a random 25% of the training rows for validation; the remaining
# 75% are used for fitting. (Renamed from `split`, which masks base::split.)
set.seed(42)
holdout <- sample(1:nrow(X), round(nrow(X) / 4))
dtrain <- xgb.DMatrix(data = X[-holdout, ], label = Y[-holdout])
dval <- xgb.DMatrix(data = X[holdout, ], label = Y[holdout])
Regression trees will be the base learners of this boosting model. The function to optimize is the objective which has a usual loss function and a function to penalize complexity. In the linear regression setting the loss function is a quadratic loss. (Doc says ‘linear regression’ so I assume it’s a quadratic loss.)
In a MART fashion the algorithm will add one tree after another, fitting the new tree to the pseudo-residuals of the previous one. Regularization for this process comes in many forms.
Parameter \(\eta\) (the stepsize) reduces the influence of each newly learned tree. It is closely related to \(L_1\) regularization in linear regression. \(\eta\) is usually set very small \(<0.1\). The smaller \(\eta\) is, the slower the model learns. So, more iterations are needed as well. The strategy here is to compute enough iterations for a given \(\eta\), watch the validation error, and then chose the number of iterations with the minimum validation error. (I start with 10000 iterations and eta = 0.01 )
The maximum allowed tree depth per model is another parameter which can limit the complexity of the model. High maximum tree depth (> 6) would represent interactions of many variables in the data. A MARS model that I fitted on the data earlier suggested no interactions. (Additive models did perform better than models with interaction terms) Therefore, small trees or stumps seem to be more appropriate here. (I will look around a maxdepth of 2, 4, 6 first).
Some additional parameter which reduces overfitting is the subsampling rate. This is the proportion of data points which get sampled (without replacement) before fitting each tree. A usual value here is 50%. This reduces the learning process as well, but it also speeds up computation. Here is one initial run:
# Initial run: small learning rate, moderate depth, 50% row subsampling;
# RMSE on both sets is logged every iteration via the watchlist.
# NOTE(review): objective "reg:linear" was renamed "reg:squarederror" in
# newer xgboost releases — update if this warns or errors on your version.
fit <- xgb.train(booster = "gbtree", data = dtrain, eta = .01, max.depth = 4,
subsample = .5, nround = 10000, objective = "reg:linear",
watchlist = list(train = dtrain, test = dval))
By default xgb.train already computes RMSE on the training and validation set in the linear regression setting. Below is a plot of the learning process. 10000 iterations seem sufficient to reach minimum test error when \(\eta = 0.01\) and 50% subsampling is used.
# Long-format learning curve: train vs. validation RMSE per iteration.
history <- melt(fit$evaluation_log, "iter", c("train_rmse", "test_rmse"))
ggplot(history, aes(x = iter, y = value, color = variable)) +
  geom_path() +
  scale_y_continuous(limits = c(0, .2)) +
  theme_bw()
There are some tools to inspect the model. The plot below shows the number of leaves and the weighted number of observations (cover) which end up at each tree depth. With these settings, trees are mostly built to full (maximum) length.
One can prevent the formation of leafs by setting a minimum child weight. A split would not be performed if both partitions would not get a minimum number of observations. (As far as I know, in the regression setting the number of observations ending up in a single leaf can be directly translated to child weight.)
Appropriate numbers for minimum child weight have to be found. From the plot below, it seems that for a maximum tree depth of 4 it would not make sense to look for minimum child weights above around 80.
xgb.plot.deepness(model = fit)
There is another parameter \(\gamma\) which prevents trees from growing. It only allows to further grow a tree if a minimum increase in RMSE (here) is achieved by the partition.
Then, there are some parameters where I have no idea where to start. One is \(\alpha\), which is a \(L_1\) regularization for weights. The other is \(colsample_bytree\). This is another sampling method which samples a proportion of variables (columns in \(X\)) before building each tree.
Tree Depth
The maximum tree depth and minimum child weight have a huge influence on the model. First, their best values are found. Therefore, a first grid search in big steps is performed. This is the first grid:
# Coarse grid: only max_depth and min_child_weight vary in this pass; all
# other parameters are held at the initial values.
pars.grid <- expand.grid(
eta = 0.01,
gamma = 0,
max_depth = c(2, 4, 6),
min_child_weight = c(1, 20, 40, 80),
alpha = 0,
subsample = 0.5,
colsample_bytree = 0.8
)
Each parameter combination will be trained and the lowest RMSE and its iteration number for each parameter combination will be returned. An early stopping condition is added, so that the process stops if after 1000 iterations there has not been any decrease in RMSE.
In Kaggle these grid searches are commented
# Fits one xgboost model per row of `pars` and records, for each run, the
# minimum validation RMSE and the iteration at which it occurred.
#
# train:  xgb.DMatrix used for fitting
# val:    xgb.DMatrix used for validation / early stopping
# pars:   data.frame of parameter combinations (one row per run)
# n:      maximum number of boosting iterations
# bstr:   booster type
# objFun: optional custom objective returning grad/hess
# nstop:  early-stopping patience in rounds
#
# Returns a data.frame with columns `iter` and `rmse`, one row per
# parameter combination.
train.grid <- function(train, val, pars, n = 10000,
                       bstr = "gbtree", objFun = NULL, nstop = 1000) {
  N <- nrow(pars)
  out <- data.frame(iter = integer(N), rmse = numeric(N))
  for (i in seq_len(N)) {
    fit <- xgb.train(booster = bstr, data = train, eta = pars$eta[i],
                     gamma = pars$gamma[i], max.depth = pars$max_depth[i],
                     min_child_weight = pars$min_child_weight[i],
                     subsample = pars$subsample[i], alpha = pars$alpha[i],
                     colsample_bytree = pars$colsample_bytree[i],
                     nround = n,
                     # BUG FIX: the watchlist previously referenced the
                     # globals `dtrain`/`dval` instead of the function
                     # arguments, so the `val` parameter was silently
                     # ignored (it only worked because callers passed the
                     # globals).
                     watchlist = list(train = train, test = val),
                     obj = objFun, early_stopping_rounds = nstop)
    dt <- fit$evaluation_log
    idx <- which.min(dt$test_rmse)
    out$iter[i] <- idx
    out$rmse[i] <- dt$test_rmse[idx]
  }
  return(out)
}
The training is done below. For the Kaggle kernel all these grid searches are commented out. (They all run for a few minutes) The code below summarizes the results of all parameter combinations as a plot.
# Run the coarse grid and show the best parameter combination.
res <- train.grid(dtrain, dval, pars.grid)
df <- cbind(res, pars.grid)
df[which.min(df$rmse), ]
## iter rmse eta gamma max_depth min_child_weight alpha subsample
## 3 2089 0.100402 0.01 0 6 1 0 0.5
## colsample_bytree
## 3 0.8
# Validation RMSE by min_child_weight, colored by tree depth.
ggplot(df, aes(x = min_child_weight, y = rmse)) +
geom_point(aes(color = as.factor(max_depth))) +
theme_bw()
Minimum child weight of 1 is the best for all tree depths. The maximum tree depth parameter performs differently depending on the minimum child weight, but for child weight 1, a depth of 6 is best. In the next grid a smaller area of the same parameter space is searched.
# Refined grid around the best coarse values (depth near 6, small child
# weights).
pars.grid <- expand.grid(
eta = 0.01,
gamma = 0,
max_depth = c(5, 6, 7),
min_child_weight = c(1, 5, 10),
alpha = 0,
subsample = 0.5,
colsample_bytree = 0.8
)
res <- train.grid(dtrain, dval, pars.grid)
df <- cbind(res, pars.grid)
df[which.min(df$rmse), ]
## iter rmse eta gamma max_depth min_child_weight alpha subsample
## 6 1665 0.100317 0.01 0 7 5 0 0.5
## colsample_bytree
## 6 0.8
# Same diagnostic plot as before, on the refined grid.
ggplot(df, aes(x = min_child_weight, y = rmse)) +
geom_point(aes(color = as.factor(max_depth))) +
theme_bw()
A minimum child weight of 5 with a maximum tree depth of 7 performs best. Again, child weight has the highest influence.
Regularization
Next, \(\gamma\) and \(\alpha\) are tested.
# Grid over the regularization parameters gamma (min split gain) and alpha
# (L1 weight penalty), with tree shape fixed at the previous winner.
pars.grid <- expand.grid(
eta = 0.01,
gamma = c(0, 0.1, 0.2, 0.3),
max_depth = 7,
min_child_weight = 5,
alpha = c(0, 0.3, 0.7),
subsample = 0.5,
colsample_bytree = 0.8
)
res <- train.grid(dtrain, dval, pars.grid)
df <- cbind(res, pars.grid)
df[which.min(df$rmse), ]
## iter rmse eta gamma max_depth min_child_weight alpha subsample
## 1 1413 0.102517 0.01 0 7 5 0 0.5
## colsample_bytree
## 1 0.8
# RMSE by gamma, colored by alpha.
ggplot(df, aes(x = gamma, y = rmse)) +
geom_point(aes(color = as.factor(alpha))) +
theme_bw()
\(\gamma = 0\) and \(\alpha = 0\) performed best. The differences here are in the range of 0.004 RMSE.
Subsampling
Next, row and column subsampling is analyzed.
# Grid over row (subsample) and column (colsample_bytree) sampling rates.
pars.grid <- expand.grid(
eta = 0.01,
gamma = 0,
max_depth = 7,
min_child_weight = 5,
alpha = 0,
subsample = c(0.3, 0.5, 0.8),
colsample_bytree = c(0.3, 0.5, 0.8)
)
res <- train.grid(dtrain, dval, pars.grid)
df <- cbind(res, pars.grid)
df[which.min(df$rmse), ]
## iter rmse eta gamma max_depth min_child_weight alpha subsample
## 7 1928 0.099853 0.01 0 7 5 0 0.3
## colsample_bytree
## 7 0.8
# RMSE by column sampling rate, colored by row sampling rate.
ggplot(df, aes(x = colsample_bytree, y = rmse)) +
geom_point(aes(color = as.factor(subsample))) +
theme_bw()
80% column subsampling by tree with 30% subsampling performed best. The differences are now in the range of 0.001 RMSE. The best validation RMSE is now at 0.098.
Learning Rate
Finally, \(\eta\) is set down further and the number of iterations is increased. The model learns really slow now, so the stopping condition is increased to 5000.
# Winning parameters so far, but with the learning rate halved; the run
# below checks whether slower learning improves the validation RMSE.
pars1 <- list(
eta = 0.005,
gamma = 0,
max_depth = 7,
min_child_weight = 5,
alpha = 0,
subsample = 0.3,
colsample_bytree = 0.8
)
# Long run with patient early stopping (objective passed explicitly here;
# the rest of the parameters come in via `params`).
fit <- xgb.train(params = pars1, data = dtrain, nround = 100000,
objective = "reg:linear", early_stopping_rounds = 5000,
watchlist = list(train = dtrain, test = dval))
This model reached a minimum RMSE of 0.098 at around 20000 iterations. The improvement is something below 0.0001 RMSE.
dt <- melt(fit$evaluation_log, "iter", c("train_rmse", "test_rmse"))
# NOTE(review): dt$value contains BOTH train and test RMSE, so this minimum
# is the (much lower) training RMSE, not the validation RMSE quoted in the
# surrounding text.
min(dt$value)
## [1] 0.017142
# Learning curve; the y-axis is clipped at 0.2, which removes early
# iterations (hence the geom_path warning).
ggplot(dt) +
geom_path(aes(x = iter, y = value, color = variable)) +
scale_y_continuous(limits = c(0, .2)) +
theme_bw()
## Warning: Removed 1755 rows containing missing values (geom_path).
That’s not really worth it, so I keep the model with \(\eta = 0.01\) which reached its minimum validation RMSE after 9887 iterations.
# Final quadratic-loss parameter set (eta back to 0.01; see text above).
pars1 <- list(
eta = 0.01,
gamma = 0,
max_depth = 7,
min_child_weight = 5,
alpha = 0,
subsample = 0.3,
colsample_bytree = 0.8
)
Quadratic loss can be sensitive to outliers. Other loss functions, such as the Huber loss, become linear with increasing residual values. xgb.train allows to provide custom loss functions.
The function below should describe a Huber loss. It takes predictions and the training object and returns first and second order gradients.
# Custom Huber objective for xgb.train: quadratic loss for residuals with
# |preds - labels| < 1.5, linear beyond. Returns the first (grad) and
# second (hess) order gradients of the loss w.r.t. the predictions.
Huber <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  resid <- preds - labels
  # BUG FIX: the original condition was abs(preds - labels < 1.5) — abs()
  # of a logical — which reduces to (preds - labels < 1.5) and so treated
  # large NEGATIVE residuals as "small". The intended test is on the
  # absolute residual.
  small <- abs(resid) < 1.5
  grad <- ifelse(small, resid, 1.5 * sign(resid))
  hess <- ifelse(small, 1, 0)
  return(list(grad = grad, hess = hess))
}
Now the same parameter search as above is performed again.
Tree Depth
Again a few initial parameters are searched.
# Same coarse tree-shape grid as before, now trained with the custom Huber
# objective.
pars.grid <- expand.grid(
eta = 0.01,
gamma = 0,
max_depth = c(2, 4, 6),
min_child_weight = c(1, 20, 40, 80),
alpha = 0,
subsample = 0.5,
colsample_bytree = 0.8
)
res <- train.grid(dtrain, dval, pars.grid, objFun = Huber)
df <- cbind(res, pars.grid)
df[which.min(df$rmse), ]
## iter rmse eta gamma max_depth min_child_weight alpha subsample
## 2 4414 0.101358 0.01 0 4 1 0 0.5
## colsample_bytree
## 2 0.8
# RMSE by min_child_weight, colored by tree depth (Huber objective).
ggplot(df, aes(x = min_child_weight, y = rmse)) +
geom_point(aes(color = as.factor(max_depth))) +
theme_bw()
As before, minimum child weight is the main parameter. Maximum depth of 6 and minimum child weight of 1 give the best training error. In the next grid a smaller area of the same parameter space is searched.
# Refined tree-shape grid for the Huber objective.
pars.grid <- expand.grid(
eta = 0.01,
gamma = 0,
max_depth = c(5, 6, 7),
min_child_weight = c(1, 5, 10),
alpha = 0,
subsample = 0.5,
colsample_bytree = 0.8
)
res <- train.grid(dtrain, dval, pars.grid, objFun = Huber)
Maximum depth of 6 and minimum child weight of 5 wins.
# Best combination of the refined Huber grid plus diagnostic plot.
df <- cbind(res, pars.grid)
df[which.min(df$rmse), ]
## iter rmse eta gamma max_depth min_child_weight alpha subsample
## 4 3692 0.100468 0.01 0 5 5 0 0.5
## colsample_bytree
## 4 0.8
ggplot(df, aes(x = min_child_weight, y = rmse)) +
geom_point(aes(color = as.factor(max_depth))) +
theme_bw()
Regularization
Next, \(\gamma\) and \(\alpha\) are tested.
# gamma/alpha regularization grid for the Huber objective.
pars.grid <- expand.grid(
eta = 0.01,
gamma = c(0, 0.1, 0.2, 0.3),
max_depth = 6,
min_child_weight = 5,
alpha = c(0, 0.3, 0.7),
subsample = 0.5,
colsample_bytree = 0.8
)
res <- train.grid(dtrain, dval, pars.grid, objFun = Huber)
df <- cbind(res, pars.grid)
df[which.min(df$rmse), ]
## iter rmse eta gamma max_depth min_child_weight alpha subsample
## 1 2460 0.102275 0.01 0 6 5 0 0.5
## colsample_bytree
## 1 0.8
# RMSE by gamma, colored by alpha (Huber objective).
ggplot(df, aes(x = gamma, y = rmse)) +
geom_point(aes(color = as.factor(alpha))) +
theme_bw()
\(\gamma = 0\) and \(\alpha = 0.3\) wins.
Subsampling
Next, row and column subsampling is analyzed.
# Row/column subsampling grid for the Huber objective (alpha = 0.3 carried
# over from the preceding text).
pars.grid <- expand.grid(
eta = 0.01,
gamma = 0,
max_depth = 6,
min_child_weight = 5,
alpha = 0.3,
subsample = c(0.3, 0.5, 0.8),
colsample_bytree = c(0.3, 0.5, 0.8)
)
res <- train.grid(dtrain, dval, pars.grid, objFun = Huber)
50% row and 80% column subsampling wins.
# Best subsampling combination plus diagnostic plot (Huber objective).
df <- cbind(res, pars.grid)
df[which.min(df$rmse), ]
## iter rmse eta gamma max_depth min_child_weight alpha subsample
## 5 2318 0.101716 0.01 0 6 5 0.3 0.5
## colsample_bytree
## 5 0.5
ggplot(df, aes(x = colsample_bytree, y = rmse)) +
geom_point(aes(color = as.factor(subsample))) +
theme_bw()
Learning Rate
Again it is tested whether a slower learning rate increases performance. The model learns really slowly now, so the stopping condition is increased to 5000.
# Huber model with halved learning rate.
# NOTE(review): alpha is set to 0 and colsample_bytree to 0.8 here although
# the regularization grid above selected alpha = 0.3 and the subsampling
# grid's winning row shows colsample_bytree = 0.5 — confirm this is
# intentional.
pars2 <- list(
eta = 0.005,
gamma = 0,
max_depth = 6,
min_child_weight = 5,
alpha = 0,
subsample = 0.5,
colsample_bytree = 0.8
)
# Long run with the custom objective and patient early stopping.
fit <- xgb.train(params = pars2, data = dtrain, nround = 100000,
early_stopping_rounds = 5000, obj = Huber,
watchlist = list(train = dtrain, test = dval))
This time the test RMSE went below 0.098 after 36716 iterations.
dt <- melt(fit$evaluation_log, "iter", c("train_rmse", "test_rmse"))
# NOTE(review): as before, this is the minimum over BOTH series, i.e. the
# training RMSE, not the validation RMSE.
min(dt$value)
## [1] 0.004435
# Learning curve, y-axis clipped at 0.2 (hence the geom_path warning).
ggplot(dt) +
geom_path(aes(x = iter, y = value, color = variable)) +
scale_y_continuous(limits = c(0, .2)) +
theme_bw()
## Warning: Removed 1733 rows containing missing values (geom_path).
I will keep this model.
# Final Huber-loss parameter set (identical to the slow-eta run above).
pars2 <- list(
eta = 0.005,
gamma = 0,
max_depth = 6,
min_child_weight = 5,
alpha = 0,
subsample = 0.5,
colsample_bytree = 0.8
)
To correctly estimate the RMSE of this method, the whole procedure (including grid search and parameter selection) would need to be plugged into a repeated cross-validation. Again, this is not feasible in the Kaggle kernel.
Here, only the two models quadratic loss with pars1 and Huber loss with pars2 are compared in a cross validation. Since these were derived from 75% of the whole training set the RMSE predictions will be biased downward (better RMSE). Nevertheless, the results can be used to chose one of the two methods. For that the full training set is restored.
dtrain <- xgb.DMatrix(data = X, label = Y)
Inside the cross validation, the model is fit with the specified parameters for the specified number of rounds.
RMSE <- function(Y, Yh) sqrt(sum((Y - Yh)^2) / length(Y))
# CV fold function for crossval(): fits xgboost on the training fold and
# returns the RMSE on the held-out fold.
# NOTE(review): `pars` is NOT an argument — it is read from the global
# environment, so it must be assigned (pars <- pars1 / pars2) before
# crossval() is called. Extra arguments such as nround and obj are
# forwarded to xgb.train via `...`.
predXGB <- function(trainX, trainY, testX, testY, ...) {
dtrain <- xgb.DMatrix(data = trainX, label = trainY)
fit <- xgb.train(params = pars, data = dtrain, ...)
pred <- predict(fit, testX)
return(RMSE(testY, pred))
}
The 5-fold cross validation is repeated 3 times. It returns mean RMSE and its standard error.
# 5-fold CV repeated 3 times for the quadratic-loss model; `pars` is read
# globally by predXGB, so it must be set first.
CV <- list()
pars <- pars1
CV$Quad <- crossval(predXGB, X, Y, nround = 9887, K = 5, B = 3)
The same is repeated for the Huber loss model.
# Same CV for the Huber model; the custom objective is forwarded via `...`.
pars <- pars2
CV$Huber <- crossval(predXGB, X, Y, nround = 36716, K = 5, B = 3, obj = Huber)
Mean RMSEs and their standard error are shown below. It is obvious that the models were overfitted to the training set. The mean RMSE by cross validation is slightly below 0.12. The standard errors show, that the optimization in the range below 0.01 RMSE is questionable. None of both models is significantly better than the other.
# Collect mean CV-RMSE and its standard error for each model into a single
# data.frame and show them as point ranges.
cvStats <- lapply(CV, function(cv) data.frame(RMSE = cv$stat, se = cv$stat.se))
res <- do.call(rbind, cvStats)
res$Model <- rownames(res)
ggplot(res, aes(x = Model, y = RMSE)) +
  geom_pointrange(aes(ymax = RMSE + se, ymin = RMSE - se), color = "blue") +
  coord_flip() +
  ggtitle("CV-Estimated RMSE with Standard Error") +
  theme_classic()
The final model is fit to the training data.
# Fit the final model on all training data with the chosen parameters.
# NOTE(review): dX duplicates the dtrain rebuilt above from the same X/Y;
# one of the two objects could be reused.
dX <- xgb.DMatrix(data = X, label = Y)
fit <- xgb.train(params = pars1, data = dX, nround = 9887)
Feature Importance
There are a few tools for inspecting the final model. For example, feature importance can be estimated by the weighted abundance of each feature in the model. The 20 most important features are shown below. Like in the MARS model that I trained before, GrLivArea and OverallQual are the two most important features by far. The ranking of the features below, however, differs compared to the MARS model.
# Feature importance from the fitted ensemble; plot the top 20 features.
dt <- xgb.importance(colnames(dX), fit)
xgb.plot.importance(dt[1:20, ])
Tree Structures
There is a tool that represents the tree ensemble as a single merged tree. Since the trees here have a maximum depth of 7, it should still be reasonable to plot it. Nodes in the graph represent leaves and splits. Nodes with only one edge represent leaves.
xgb.plot.multi.trees(fit, colnames(dX))
Partial Dependency Plots
The ICEbox package offers functions for partial dependency plots. With partial dependency plots one can visualize the influence of a single predictor variable on the target variable. The target is plotted with the desired predictor. Since the target is also dependent on the other predictors, which are not shown, the expected value of the target over all unseen predictors is shown.
Below are the partial dependency plots for the 3 most important features. Note that the target SalePrice is still log transformed. Only 50% of the data points in \(X\) are used. This speeds up computation.
# ICEbox needs a dense matrix; note this OVERWRITES the sparse X built
# earlier with a dense model.matrix of the same data.
X <- model.matrix(~ -1 + ., data = train[, !"SalePrice"])
Y <- train$SalePrice
# Individual conditional expectation / partial dependence for the three
# most important features; frac_to_build = 0.5 subsamples rows for speed.
xgb.ice <- ice(fit, X, Y, "GrLivArea", frac_to_build = 0.5)
plot(xgb.ice)
xgb.ice <- ice(fit, X, Y, "OverallQual", frac_to_build = 0.5)
plot(xgb.ice)
xgb.ice <- ice(fit, X, Y, "TotalBsmtSF", frac_to_build = 0.5)
plot(xgb.ice)
Finally the test set can be predicted.
# Encode the test set the same way as the training predictors and predict.
# NOTE(review): the factor levels were fixed on the combined `full` table,
# but verify that the dummy columns of Xtest line up with the training X
# (a level present in only one set would shift columns).
Xtest <- sparse.model.matrix(~ -1 + ., data = test)
pred <- predict(fit, Xtest)
The distribution of predicted sale prices is shown below.
# Back-transform the predictions (the target was log-transformed earlier)
# and summarize the resulting sale prices.
df <- data.frame(
Id = full[label == "test", Id],
SalePrice = exp(pred)
)
summary(df$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 39850 128100 156400 177400 210300 531500
ggplot(df, aes(x = SalePrice)) + geom_histogram(bins = 30) + theme_bw()
That’s it! Thanks for reading through this whole thing. If you have any suggestions, I’m quite new to XGBoost, comments are welcome.
write.csv(df, file = "XGB_submission.csv", row.names = FALSE, quote = FALSE)